# Module 5 Demo: Functions, Vectors & Iteration
# Comprehensive examples for clinical programming applications

# ===========================
# SETUP: Load Required Packages
# ===========================

library(dplyr)
library(tibble)
library(lubridate)
library(stringr)
library(purrr)

cat("=== Module 5: Functions, Vectors & Iteration Demo ===\n")
cat("Based on R4DS Chapters 19, 20, and 21\n")
cat("Focus: Clinical programming applications\n\n")

# ===========================
# Part 1: Functions (R4DS Chapter 19)
# ===========================

cat("=== Part 1: Functions (R4DS Chapter 19) ===\n")

# When to write a function: DRY principle (Don't Repeat Yourself)
# Example: Instead of repeating normalization code

# BAD: Repetitive code
sample_data <- tibble(
  lab_value_1 = c(120, 85, 95, 110),
  lab_value_2 = c(2.1, 1.8, 2.3, 1.9),
  lab_value_3 = c(13.5, 12.1, 14.2, 13.8)
)

# Instead of:
# sample_data$lab_value_1 <- (sample_data$lab_value_1 - min(sample_data$lab_value_1, na.rm = TRUE)) /
#                            (max(sample_data$lab_value_1, na.rm = TRUE) - min(sample_data$lab_value_1, na.rm = TRUE))

# GOOD: Write a function
rescale01 <- function(x) {
  range_x <- range(x, na.rm = TRUE)
  (x - range_x[1]) / (range_x[2] - range_x[1])
}

# Apply the function
sample_data_normalized <- sample_data %>%
  mutate(across(everything(), rescale01))

cat("Original data:\n")
print(sample_data)
cat("\nNormalized data (0-1 scale):\n")
print(sample_data_normalized)

# Function components demonstration
clinical_age_category <- function(age, pediatric_cutoff = 18, elderly_cutoff = 65) {
  case_when(
    is.na(age) ~ "Unknown",
    age < pediatric_cutoff ~ "Pediatric",
    age >= pediatric_cutoff & age < elderly_cutoff ~ "Adult",
    age >= elderly_cutoff ~ "Elderly"
  )
}

# Examine function components (R4DS Ch. 19 concepts)
cat("\nFunction components:\n")
cat("Formals (arguments):\n")
print(formals(clinical_age_category))
cat("\nBody:\n")
print(body(clinical_age_category))
cat("\nEnvironment:\n")
print(environment(clinical_age_category))

# Test function with different arguments
test_ages <- c(16, 25, 45, 67, 72, NA)
cat("\nAge categories (default cutoffs):\n")
print(clinical_age_category(test_ages))

cat("\nAge categories (custom cutoffs - pediatric: 21, elderly: 60):\n")
print(clinical_age_category(test_ages, pediatric_cutoff = 21, elderly_cutoff = 60))

# ===========================
# Part 2: Vectors (R4DS Chapter 20)
# ===========================

cat("\n=== Part 2: Vectors (R4DS Chapter 20) ===\n")

# Vector basics - the building blocks of R
cat("Vector types in clinical data:\n")

# Atomic vector types
logical_vec <- c(TRUE, FALSE, TRUE, FALSE)     # Treatment response
integer_vec <- c(1L, 2L, 3L, 4L)               # Visit numbers
double_vec <- c(120.5, 85.2, 95.8, 110.1)     # Lab values (systolic BP)
character_vec <- c("M", "F", "M", "F")         # Gender
complex_vec <- c(1+2i, 3+4i)                   # Rarely used in clinical
raw_vec <- as.raw(c(65, 66, 67))               # Rarely used in clinical

cat("Logical vector:", typeof(logical_vec), "\n")
cat("Integer vector:", typeof(integer_vec), "\n")
cat("Double vector:", typeof(double_vec), "\n")
cat("Character vector:", typeof(character_vec), "\n")

# Vector coercion - important for clinical data processing
mixed_values <- c(1, 2, 3, "4", 5)
cat("\nMixed vector gets coerced to:", typeof(mixed_values), "\n")
print(mixed_values)

# Common clinical data coercion issues
lab_values_messy <- c("120", "85", "normal", "95", "high")
cat("\nAttempting to convert messy lab values to numeric:\n")
numeric_labs <- suppressWarnings(as.numeric(lab_values_messy))
print(numeric_labs)

# Safe coercion function for clinical data
safe_numeric_conversion <- function(x) {
  # Remove non-numeric words first
  cleaned <- str_replace_all(x, "normal|high|low", "")
  cleaned <- str_trim(cleaned)
  cleaned[cleaned == ""] <- NA_character_
  suppressWarnings(as.numeric(cleaned))
}

safer_labs <- safe_numeric_conversion(lab_values_messy)
cat("After safe conversion:\n")
print(safer_labs)

# Vector subsetting - crucial for clinical data manipulation
subject_ids <- c("001-001", "001-002", "001-003", "001-004", "001-005")
ages <- c(25, 67, 45, 72, 34)
names(ages) <- subject_ids

cat("\nSubsetting examples:\n")
cat("First and third subjects:", subject_ids[c(1, 3)], "\n")
cat("Exclude second and fourth:", subject_ids[-c(2, 4)], "\n")
cat("Elderly subjects (age >= 65):", subject_ids[ages >= 65], "\n")
cat("Ages by name:", ages[c("001-001", "001-004")], "\n")

# Create clinical demographics with proper vector types
dm <- tibble(
  USUBJID = paste0("001-", sprintf("%03d", 1:8)),
  AGE = c(25, 45, 67, 52, 71, 34, 58, 63),
  SEX = c("F", "M", "F", "M", "F", "F", "M", "F"),
  WEIGHT = c(65.5, 80.2, 58.7, 75.1, 62.3, 70.8, 85.4, 60.2),
  HEIGHT = c(160L, 175L, 155L, 180L, 158L, 165L, 185L, 162L),  # Integers
  RFSTDTC = "2024-01-15"
)

cat("\nDemographics data with proper vector types:\n")
print(dm)
cat("\nColumn types:\n")
print(sapply(dm, typeof))

# Function with vector type checking
derive_bmi_safe <- function(data, weight_var, height_var) {
  # Input validation using vector testing
  if (!is.data.frame(data)) {
    stop("data must be a data frame")
  }

  data %>%
    mutate(
      BMI = case_when(
        is.na({{ weight_var }}) | is.na({{ height_var }}) ~ NA_real_,
        {{ height_var }} <= 0 | {{ weight_var }} <= 0 ~ NA_real_,
        TRUE ~ {{ weight_var }} / ({{ height_var }} / 100)^2
      ),
      BMI_CATEGORY = case_when(
        is.na(BMI) ~ "Unknown",
        BMI < 18.5 ~ "Underweight",
        BMI >= 18.5 & BMI < 25 ~ "Normal",
        BMI >= 25 & BMI < 30 ~ "Overweight",
        BMI >= 30 ~ "Obese"
      )
    )
}

dm_with_bmi <- dm %>%
  derive_bmi_safe(WEIGHT, HEIGHT)

cat("\nWith BMI calculated:\n")
print(dm_with_bmi %>% select(USUBJID, WEIGHT, HEIGHT, BMI, BMI_CATEGORY))

# ===========================
# Part 3: Iteration (R4DS Chapter 21)
# ===========================

cat("\n=== Part 3: Iteration (R4DS Chapter 21) ===\n")

# For loops - imperative programming approach
cat("For loops for clinical data processing:\n")

# Example 1: Calculate summary statistics for multiple numeric columns
numeric_vars <- c("AGE", "WEIGHT", "HEIGHT", "BMI")
dm_extended <- dm_with_bmi

# Pre-allocate results (important for performance)
summary_stats <- vector("list", length(numeric_vars))
names(summary_stats) <- numeric_vars

for (i in seq_along(numeric_vars)) {
  var_name <- numeric_vars[[i]]
  var_data <- dm_extended[[var_name]]

  if (is.numeric(var_data)) {
    summary_stats[[i]] <- list(
      variable = var_name,
      n = sum(!is.na(var_data)),
      mean = round(mean(var_data, na.rm = TRUE), 2),
      sd = round(sd(var_data, na.rm = TRUE), 2),
      min = min(var_data, na.rm = TRUE),
      max = max(var_data, na.rm = TRUE)
    )
  }
}

cat("Summary statistics using for loop:\n")
print(summary_stats)

# Example 2: Modifying existing objects
# Standardize multiple character columns
ae_data <- tibble(
  USUBJID = c("001-001", "001-001", "001-002", "001-002", "001-003"),
  AESEQ = c(1, 2, 1, 2, 1),
  AEDECOD = c("  HEADACHE ", "NAUSEA", "fatigue  ", "DIZZINESS", "RASH"),
  CMDECOD = c("aspirin  ", "  IBUPROFEN", "acetaminophen  ", "NSAID", "  topical cream"),
  AESTDTC = c("2024-01-20", "2024-01-25", "2024-01-18", "2024-01-22", "2024-01-26"),
  RFSTDTC = c("2024-01-15", "2024-01-15", "2024-01-16", "2024-01-16", "2024-01-15")
)

cat("\nOriginal AE data with messy text:\n")
print(ae_data)

# Use for loop to clean text columns
text_cols <- c("AEDECOD", "CMDECOD")

for (col in text_cols) {
  ae_data[[col]] <- ae_data[[col]] %>%
    str_trim() %>%
    str_to_upper() %>%
    str_replace_all("\\s+", " ")
}

cat("\nCleaned AE data:\n")
print(ae_data)

# Example 3: Unknown output length (while loop)
# Simulate dose escalation in clinical trial
dose_escalation_simulation <- function(starting_dose = 10, max_dose = 100, safety_prob = 0.8) {
  current_dose <- starting_dose
  doses <- current_dose
  escalation_step <- 1

  cat("Starting dose escalation simulation:\n")
  cat("Step", escalation_step, ": Dose =", current_dose, "mg\n")

  while (current_dose < max_dose) {
    # Simulate safety assessment (simplified)
    safety_ok <- rbinom(1, 1, prob = safety_prob)

    if (safety_ok) {
      current_dose <- min(current_dose * 1.5, max_dose)  # Escalate by 50%
      escalation_step <- escalation_step + 1
      cat("Step", escalation_step, ": Dose =", round(current_dose, 1), "mg (SAFE - Escalate)\n")
    } else {
      cat("Step", escalation_step + 1, ": SAFETY ISSUE - Stop escalation\n")
      break
    }

    doses <- c(doses, current_dose)

    # Safety check to prevent infinite loop
    if (length(doses) > 10) {
      cat("Maximum escalation steps reached\n")
      break
    }
  }

  return(doses)
}

escalation_doses <- dose_escalation_simulation()
cat("Final dose sequence:", escalation_doses, "\n")

# Functional programming approach - often cleaner
cat("\nFunctional programming with map():\n")

# Instead of for loop, use map for summary stats
library(purrr)

summary_functional <- dm_extended %>%
  select(where(is.numeric)) %>%
  map_dfr(~ tibble(
    n = sum(!is.na(.x)),
    mean = round(mean(.x, na.rm = TRUE), 2),
    sd = round(sd(.x, na.rm = TRUE), 2),
    min = min(.x, na.rm = TRUE),
    max = max(.x, na.rm = TRUE)
  ), .id = "variable")

cat("Summary statistics using map():\n")
print(summary_functional)
# ===========================
# Part 4: SAS Macro Translation Using R4DS Concepts
# ===========================

cat("\n=== Part 4: SAS Macro Translation Using R4DS Concepts ===\n")

# SAS macro example translated to R function with proper vector handling
calc_study_day <- function(data, event_date, ref_date, new_var = "STUDY_DAY") {
  data %>%
    mutate(
      !!new_var := case_when(
        is.na(ymd({{ event_date }})) | is.na(ymd({{ ref_date }})) ~ NA_real_,
        ymd({{ event_date }}) >= ymd({{ ref_date }}) ~ as.numeric(ymd({{ event_date }}) - ymd({{ ref_date }})) + 1,
        ymd({{ event_date }}) < ymd({{ ref_date }}) ~ as.numeric(ymd({{ event_date }}) - ymd({{ ref_date }}))
      )
    )
}

# Apply study day calculation
ae_with_studyday <- ae_data %>%
  calc_study_day(AESTDTC, RFSTDTC, "AESTDY")

cat("With study day calculated:\n")
print(ae_with_studyday)

# Function with proper error handling (R4DS Ch. 19 concepts)
safe_calc_study_day <- function(data, event_date, ref_date, new_var = "STUDY_DAY") {
  # Input validation
  if (!is.data.frame(data)) {
    stop("data must be a data frame")
  }

  event_col <- rlang::as_name(rlang::enquo(event_date))
  ref_col <- rlang::as_name(rlang::enquo(ref_date))

  if (!event_col %in% names(data)) {
    stop(paste("Column", event_col, "not found in data"))
  }

  if (!ref_col %in% names(data)) {
    stop(paste("Column", ref_col, "not found in data"))
  }

  # Apply calculation
  result <- data %>%
    mutate(
      !!new_var := case_when(
        is.na(ymd({{ event_date }})) | is.na(ymd({{ ref_date }})) ~ NA_real_,
        ymd({{ event_date }}) >= ymd({{ ref_date }}) ~ as.numeric(ymd({{ event_date }}) - ymd({{ ref_date }})) + 1,
        ymd({{ event_date }}) < ymd({{ ref_date }}) ~ as.numeric(ymd({{ event_date }}) - ymd({{ ref_date }}))
      )
    )

  # Validation of results
  n_missing <- sum(is.na(result[[new_var]]))
  if (n_missing > 0) {
    warning(paste(n_missing, "rows had missing study day calculations"))
  }

  return(result)
}

# Test error handling
cat("\nTesting safe function with error handling:\n")
ae_safe <- ae_data %>%
  safe_calc_study_day(AESTDTC, RFSTDTC, "AESTDY_SAFE")

print(ae_safe %>% select(USUBJID, AESTDTC, RFSTDTC, AESTDY_SAFE))

# ===========================
# Part 5: Advanced Functional Programming (map family)
# ===========================

cat("\n=== Part 5: Advanced Functional Programming (map family) ===\n")

# map() family for different return types
clinical_studies <- list(
  study_a = c(45, 67, 34, 52, 71, 38),
  study_b = c(28, 49, 63, 41, 55, 44),
  study_c = c(39, 58, 46, 62, 33, 54)
)

# map() returns a list
mean_ages_list <- clinical_studies %>%
  map(mean)
cat("Mean ages (list):\n")
print(mean_ages_list)

# map_dbl() returns a numeric vector
mean_ages_numeric <- clinical_studies %>%
  map_dbl(mean)
cat("\nMean ages (numeric vector):\n")
print(mean_ages_numeric)

# map_chr() returns a character vector
age_summaries <- clinical_studies %>%
  map_chr(~ paste("Mean:", round(mean(.x), 1), "| SD:", round(sd(.x), 1)))
cat("\nAge summaries (character vector):\n")
print(age_summaries)

# Anonymous functions and shortcuts
cat("\nThree ways to write the same computation:\n")

# 1. Traditional anonymous function
result1 <- clinical_studies %>%
  map(function(x) mean(x) + 2 * sd(x))

# 2. Formula shortcut (purrr style)
result2 <- clinical_studies %>%
  map(~ mean(.x) + 2 * sd(.x))

# 3. Using existing function with additional arguments
result3 <- clinical_studies %>%
  map(~ quantile(.x, probs = 0.95))

cat("Upper control limits (mean + 2*SD):\n")
print(result2)

# map2() for two inputs - BMI calculation
weights <- c(70, 65, 80, 75, 68)
heights <- c(175, 160, 185, 180, 155)

bmis <- map2_dbl(weights, heights, ~ .x / (.y / 100)^2)
cat("\nBMI calculations using map2_dbl():\n")
print(round(bmis, 1))

# pmap() for multiple inputs
patient_data <- list(
  weight = c(70, 65, 80, 75),
  height = c(175, 160, 185, 180),
  age = c(45, 62, 38, 55),
  sex = c("M", "F", "M", "F")
)

# Function that uses all inputs
calculate_health_score <- function(weight, height, age, sex) {
  bmi <- weight / (height / 100)^2
  age_factor <- ifelse(age > 50, 0.9, 1.0)
  sex_factor <- ifelse(sex == "M", 1.0, 0.95)
  score <- (25 / bmi) * age_factor * sex_factor
  round(score, 2)
}

health_scores <- pmap_dbl(patient_data, calculate_health_score)
cat("\nHealth scores using pmap_dbl():\n")
print(health_scores)

# Processing multiple clinical datasets
study_datasets <- list(
  study_001 = tibble(
    USUBJID = paste0("001-", 1:4),
    AGE = c(45, 67, 34, 52),
    WEIGHT = c(70, 65, 80, 75)
  ),
  study_002 = tibble(
    USUBJID = paste0("002-", 1:3),
    AGE = c(28, 49, 63),
    WEIGHT = c(68, 72, 58)
  ),
  study_003 = tibble(
    USUBJID = paste0("003-", 1:5),
    AGE = c(39, 58, 46, 62, 33),
    WEIGHT = c(74, 69, 77, 61, 73)
  )
)

# Process each study with same function
process_study_data <- function(data) {
  data %>%
    mutate(
      BMI = WEIGHT / (1.70^2),  # Assume height = 170cm
      AGE_GROUP = case_when(
        AGE < 40 ~ "Young",
        AGE >= 40 & AGE < 60 ~ "Middle",
        AGE >= 60 ~ "Senior"
      ),
      ELDERLY_FLAG = ifelse(AGE >= 65, "Y", "N")
    )
}

processed_studies <- study_datasets %>%
  map(process_study_data)

cat("\nProcessed studies:\n")
iwalk(processed_studies, ~ {
  cat("Study:", .y, "\n")
  print(.x)
  cat("\n")
})

# ===========================
# Part 6: Advanced Function Concepts (Function Factories & Error Handling)
# ===========================

cat("\n=== Part 6: Advanced Function Concepts ===\n")

# Function factories - functions that create other functions
create_domain_validator <- function(domain_prefix, min_length = 3) {
  function(subject_id) {
    if (nchar(subject_id) < min_length) {
      return(FALSE)
    }
    str_detect(subject_id, paste0("^", domain_prefix, "-"))
  }
}

# Create specific validators
validate_ae_subject <- create_domain_validator("AE")
validate_dm_subject <- create_domain_validator("DM")
validate_vs_subject <- create_domain_validator("VS")

# Test the validators
test_subjects <- c("AE-001", "DM-002", "VS-003", "AE-004", "XX-005")
cat("AE subject validation:\n")
print(map_lgl(test_subjects, validate_ae_subject))

# Function with multiple return options
analyze_clinical_data <- function(data, return_type = "summary", group_var = NULL) {

  if (!is.null(group_var)) {
    base_analysis <- data %>%
      group_by({{ group_var }}) %>%
      summarise(
        n_subjects = n_distinct(USUBJID),
        mean_age = round(mean(AGE, na.rm = TRUE), 1),
        median_age = median(AGE, na.rm = TRUE),
        sd_age = round(sd(AGE, na.rm = TRUE), 1),
        .groups = "drop"
      )
  } else {
    base_analysis <- data %>%
      summarise(
        n_subjects = n_distinct(USUBJID),
        mean_age = round(mean(AGE, na.rm = TRUE), 1),
        median_age = median(AGE, na.rm = TRUE),
        sd_age = round(sd(AGE, na.rm = TRUE), 1)
      )
  }

  switch(return_type,
    "summary" = base_analysis,
    "detailed" = base_analysis %>%
      mutate(
        age_range = paste(min(data$AGE, na.rm = TRUE), max(data$AGE, na.rm = TRUE), sep = "-"),
        cv_age = round(sd_age / mean_age * 100, 1)
      ),
    "count_only" = if(is.null(group_var)) base_analysis$n_subjects else sum(base_analysis$n_subjects),
    "ages_only" = data$AGE
  )
}

# Test with demographics data
cat("\nAnalysis of demographics data:\n")
cat("Summary:\n")
print(analyze_clinical_data(dm_with_bmi, "summary"))

cat("\nDetailed:\n")
print(analyze_clinical_data(dm_with_bmi, "detailed"))

cat("\nGrouped by BMI category:\n")
print(analyze_clinical_data(dm_with_bmi, "summary", BMI_CATEGORY))

# Error handling with possibly() and safely()
risky_function <- function(x) {
  if (x < 0) stop("x must be positive")
  sqrt(x)
}

# Using possibly() for graceful error handling
safe_sqrt <- possibly(risky_function, otherwise = NA)

test_values <- c(4, 9, -1, 16, -5, 25)
results <- map_dbl(test_values, safe_sqrt)
cat("\nSafe square root results:\n")
print(results)

# Using safely() to capture both result and error
safe_sqrt_detailed <- safely(risky_function)
detailed_results <- map(test_values, safe_sqrt_detailed)

cat("\nDetailed safe results (first few):\n")
print(detailed_results[1:3])

# ===========================
# Part 7: Predicate Functions and Advanced Iteration
# ===========================

cat("\n=== Part 7: Predicate Functions and Advanced Iteration ===\n")

# keep() and discard() for filtering
lab_results <- list(
  glucose = c(90, 95, 110, 85, 92),
  creatinine = c(1.1, 0.9, 1.3, 1.0, 1.2),
  invalid_test = c(NA, NA, NA, NA, NA),
  hemoglobin = c(13.5, 12.1, 14.2, 13.8, 12.9),
  empty_test = numeric(0)
)

# Keep only valid lab tests (non-empty and not all NA)
valid_labs <- lab_results %>%
  keep(~ length(.x) > 0 && !all(is.na(.x)))

cat("Valid lab tests:\n")
print(names(valid_labs))

# Find lab tests with potential abnormal values (simplified thresholds)
abnormal_labs <- lab_results %>%
  keep(~ any(.x > 15 | .x < 0.5, na.rm = TRUE))

cat("Labs with potential abnormal values:\n")
print(names(abnormal_labs))

# reduce() for cumulative operations
daily_ae_counts <- c(2, 1, 3, 0, 2, 1, 4, 2, 1, 0)

# Calculate cumulative AE count over study period
cumulative_ae <- accumulate(daily_ae_counts, `+`)
cat("\nDaily AE counts:", daily_ae_counts, "\n")
cat("Cumulative AE counts:", cumulative_ae, "\n")

# Combine multiple study datasets using reduce()
multi_study_data <- list(
  tibble(USUBJID = c("A001", "A002"), ARM = "Treatment", STUDY = "001"),
  tibble(USUBJID = c("B001", "B002"), ARM = "Placebo", STUDY = "002"),
  tibble(USUBJID = c("C001", "C002"), ARM = "Treatment", STUDY = "003")
)

# Combine all studies
pooled_data <- reduce(multi_study_data, bind_rows)
cat("\nPooled study data:\n")
print(pooled_data)

# walk() for side effects (reports, plots, file outputs)
create_study_summary <- function(data, study_name) {
  n_subjects <- nrow(data)
  treatments <- unique(data$ARM)

  cat("=== Study", study_name, "Summary ===\n")
  cat("Number of subjects:", n_subjects, "\n")
  cat("Treatment arms:", paste(treatments, collapse = ", "), "\n")
  cat("Subject IDs:", paste(data$USUBJID, collapse = ", "), "\n\n")
}

# Generate summaries for each study (side effect)
cat("Study summaries:\n")
walk2(multi_study_data, c("001", "002", "003"), create_study_summary)

# ===========================
# Part 8: Complete Clinical Data Processing Pipeline
# ===========================

cat("\n=== Part 8: Complete Clinical Data Processing Pipeline ===\n")

# Create a comprehensive function that combines all R4DS concepts
create_clinical_analysis_pipeline <- function(raw_data,
                                            age_cutoff = 65,
                                            weight_unit = "kg",
                                            return_summary = TRUE) {

  # Input validation (R4DS Ch. 19 - Functions)
  if (!is.data.frame(raw_data)) {
    stop("raw_data must be a data frame")
  }

  required_cols <- c("USUBJID", "AGE", "WEIGHT", "HEIGHT")
  missing_cols <- setdiff(required_cols, names(raw_data))
  if (length(missing_cols) > 0) {
    stop("Missing required columns: ", paste(missing_cols, collapse = ", "))
  }

  # Data processing using vectors and functions (R4DS Ch. 19, 20)
  processed_data <- raw_data %>%
    mutate(
      # Derive elderly flag
      ELDERLY = case_when(
        is.na(AGE) ~ "Unknown",
        AGE >= age_cutoff ~ "Yes",
        TRUE ~ "No"
      ),

      # Calculate BMI with proper vector handling
      BMI = case_when(
        is.na(WEIGHT) | is.na(HEIGHT) ~ NA_real_,
        WEIGHT <= 0 | HEIGHT <= 0 ~ NA_real_,
        TRUE ~ WEIGHT / (HEIGHT / 100)^2
      ),

      # BMI categories
      BMI_CATEGORY = case_when(
        is.na(BMI) ~ "Unknown",
        BMI < 18.5 ~ "Underweight",
        BMI >= 18.5 & BMI < 25 ~ "Normal",
        BMI >= 25 & BMI < 30 ~ "Overweight",
        BMI >= 30 ~ "Obese"
      ),

      # Age groups
      AGE_GROUP = case_when(
        is.na(AGE) ~ "Unknown",
        AGE < 40 ~ "Young Adult",
        AGE >= 40 & AGE < 65 ~ "Middle Age",
        AGE >= 65 ~ "Senior"
      )
    )

  # Generate summary using iteration (R4DS Ch. 21)
  if (return_summary) {
    summary_stats <- processed_data %>%
      summarise(
        n_subjects = n(),
        mean_age = round(mean(AGE, na.rm = TRUE), 1),
        sd_age = round(sd(AGE, na.rm = TRUE), 1),
        mean_bmi = round(mean(BMI, na.rm = TRUE), 1),
        n_elderly = sum(ELDERLY == "Yes", na.rm = TRUE),
        pct_elderly = round(n_elderly / n_subjects * 100, 1)
      )

    # Count by categories using map-like approach
    age_group_counts <- processed_data %>%
      count(AGE_GROUP, name = "n") %>%
      mutate(percentage = round(n / sum(n) * 100, 1))

    bmi_category_counts <- processed_data %>%
      count(BMI_CATEGORY, name = "n") %>%
      mutate(percentage = round(n / sum(n) * 100, 1))

    cat("=== CLINICAL DATA ANALYSIS SUMMARY ===\n")
    cat("Overall Statistics:\n")
    print(summary_stats)
    cat("\nAge Group Distribution:\n")
    print(age_group_counts)
    cat("\nBMI Category Distribution:\n")
    print(bmi_category_counts)

    return(list(
      data = processed_data,
      summary = summary_stats,
      age_groups = age_group_counts,
      bmi_categories = bmi_category_counts
    ))
  } else {
    return(processed_data)
  }
}

# Test the comprehensive pipeline
test_clinical_data <- tibble(
  USUBJID = paste0("SUB-", sprintf("%03d", 1:10)),
  AGE = c(25, 45, 67, 52, 71, 34, 58, 63, 29, 76),
  WEIGHT = c(65, 80, 58, 75, 62, 70, 85, 60, 78, 55),
  HEIGHT = c(160, 175, 155, 180, 158, 165, 185, 162, 172, 150),
  SEX = c("F", "M", "F", "M", "F", "F", "M", "F", "M", "F")
)

# Run the complete pipeline
pipeline_results <- create_clinical_analysis_pipeline(test_clinical_data)

# Apply to multiple studies using map() (R4DS Ch. 21)
multiple_studies <- list(
  study_A = test_clinical_data %>% mutate(USUBJID = paste0("A-", sprintf("%03d", 1:10))),
  study_B = test_clinical_data %>%
    mutate(
      USUBJID = paste0("B-", sprintf("%03d", 1:10)),
      AGE = AGE + sample(-5:5, 10, replace = TRUE),
      WEIGHT = WEIGHT + sample(-10:10, 10, replace = TRUE)
    ),
  study_C = test_clinical_data %>%
    mutate(
      USUBJID = paste0("C-", sprintf("%03d", 1:10)),
      AGE = AGE + sample(-3:8, 10, replace = TRUE),
      WEIGHT = WEIGHT + sample(-8:12, 10, replace = TRUE)
    )
)

cat("\n=== PROCESSING MULTIPLE STUDIES ===\n")

# Process all studies and extract summary statistics
study_summaries <- multiple_studies %>%
  map(~ create_clinical_analysis_pipeline(.x, return_summary = FALSE)) %>%
  map_dfr(~ summarise(.x,
    n_subjects = n(),
    mean_age = round(mean(AGE, na.rm = TRUE), 1),
    mean_bmi = round(mean(BMI, na.rm = TRUE), 1),
    pct_elderly = round(sum(ELDERLY == "Yes") / n() * 100, 1)
  ), .id = "study")

cat("Summary across all studies:\n")
print(study_summaries)

# ===========================
# Part 9: GitHub Copilot in RStudio Practice
# ===========================

cat("\n=== Part 9: GitHub Copilot in RStudio Practice ===\n")
cat("Try writing these comments in RStudio and see what Copilot suggests:\n\n")

# Example 1: Let Copilot help with function creation
cat("# Create function to flag subjects with multiple adverse events\n")
flag_multiple_aes <- function(data) {
  # Copilot should suggest something like:
  data %>%
    group_by(USUBJID) %>%
    summarise(n_aes = n(), .groups = "drop") %>%
    mutate(multiple_aes_flag = ifelse(n_aes > 1, "Y", "N"))
}

# Example 2: Copilot for mathematical functions
cat("# Function to calculate percent change from baseline\n")
calc_percent_change <- function(baseline, post_baseline) {
  # Copilot should suggest:
  ((post_baseline - baseline) / baseline) * 100
}

# Example 3: Complex clinical derivations
cat("# Derive safety population flag based on treatment exposure\n")
derive_safety_flag <- function(data, min_exposure_days = 1) {
  # Copilot should suggest exposure-based logic
  data %>%
    mutate(
      safety_flag = case_when(
        is.na(EXPOSURE_DAYS) ~ "N",
        EXPOSURE_DAYS >= min_exposure_days ~ "Y",
        TRUE ~ "N"
      )
    )
}

# Example 4: Comprehensive data processing
cat("# Function to create analysis-ready dataset with all derivations\n")
create_analysis_ready <- function(raw_data) {
  # Copilot should suggest a pipeline approach
  raw_data %>%
    # Data cleaning
    mutate(across(where(is.character), str_trim)) %>%
    # Derivations
    mutate(
      ELDERLY = ifelse(AGE >= 65, "Y", "N"),
      BMI = WEIGHT / (HEIGHT / 100)^2
    ) %>%
    # Validation flags
    mutate(
      data_complete = case_when(
        is.na(AGE) | is.na(WEIGHT) | is.na(HEIGHT) ~ "N",
        TRUE ~ "Y"
      )
    )
}

cat("\nCopilot Pro Tips:\n")
cat("1. Use descriptive variable names and clinical context\n")
cat("2. Write clear comments describing expected functionality\n")
cat("3. Include parameter types and expected outputs\n")
cat("4. Mention CDISC standards when relevant\n")
cat("5. Specify error handling requirements\n")

# ===========================
# SUMMARY AND NEXT STEPS
# ===========================

cat("\n=== Module 5 Demo Complete! ===\n")
cat("R4DS Integration Summary:\n")
cat("✓ Functions (Ch. 19): Creation, components, arguments, error handling\n")
cat("✓ Vectors (Ch. 20): Types, coercion, subsetting, safe operations\n")
cat("✓ Iteration (Ch. 21): for loops, while loops, map() family, functional programming\n")
cat("\nClinical Programming Applications:\n")
cat("✓ SAS macro translation to R functions\n")
cat("✓ Vectorized operations for efficient data processing\n")
cat("✓ Error handling and input validation\n")
cat("✓ Batch processing of multiple clinical datasets\n")
cat("✓ Advanced functional programming patterns\n")
cat("✓ GitHub Copilot integration for AI-assisted development\n")
cat("\nReady for hands-on practice in the exercise!\n")
